{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "filename = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"blogs\", \"1005545.male.25.Engineering.Sagittarius.xml\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "all_posts = []\n",
      "with open(filename) as inf:\n",
      "    # remove leading and trailing whitespace\n",
      "    post_start = False\n",
      "    post = []\n",
      "    for line in inf:\n",
      "        line = line.strip()\n",
      "        if line == \"<post>\":\n",
      "            post_start = True\n",
      "        elif line == \"</post>\":\n",
      "            post_start = False\n",
      "            all_posts.append(\"\\n\".join(post))\n",
      "            post = []\n",
      "        elif post_start:\n",
      "            post.append(line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(all_posts)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "80"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}